library(rjson)
library(stringr)
library(ggplot2)
library(R.utils)
library(multiwayvcov)
library(lmtest)
library(stm)
library(stopwords)
library(stargazer)
library(here)

###########################
###########################
# MAKE SEGMENTS
###########################
###########################

#### GET NEWS SEGMENTS ####
# The three channels go through exactly the same pipeline, so it is written
# once as helper functions and applied per channel below. Files are addressed
# through here() rather than by changing the working directory.

# Directory holding the closed-caption JSON pulls.
input_dir <- here("release_data", "TopicModelInputs")

# Pattern that marks a caption line as an immigration mention. It is also used
# to validate the assembled segment text.
immigr_pattern <- "immigr|illegals|illegal alien"

# Read all pulls for one channel and concatenate their programs.
#
# channel: file-name prefix, e.g. "msnbc".
# dir:     directory containing the JSON files.
# Returns the programs of all pulls as one list, in pull order
# (apr-sept 2019, 2019, 2018, base file).
read_channel_programs <- function(channel, dir = input_dir) {
  suffixes <- c(
    "_immigr_apr_sept2019.json",
    "_immigr_2019.json",
    "_immigr_2018.json",
    "_immigr.json"
  )
  pulls <- lapply(
    file.path(dir, paste0(channel, suffixes)),
    function(path) fromJSON(file = path)
  )
  do.call(c, pulls)
}

# Extract the immigration segments of a single program.
#
# prog:    one program record with fields cc (newline-separated captions),
#          times (one time stamp per caption line), start_localtime, program
#          and title.
# pattern: regex identifying an immigration mention.
# max_gap: largest gap between consecutive mentions (same unit as times;
#          the original comments treat it as seconds) that keeps them in the
#          same segment.
# Returns a data frame with columns start, stop, start_time, program, title,
# txt, or NULL when the program has fewer than two mentions or no time stamps.
program_segments <- function(prog, pattern = immigr_pattern, max_gap = 60) {
  n_lines <- length(prog$times)
  # Programs without time stamps used to crash via `1:0`; skip them instead.
  if (n_lines == 0) {
    return(NULL)
  }

  line_times <- prog$times
  captions <- unlist(str_split(prog$cc, "\n"))[seq_len(n_lines)]

  mention_times <- line_times[grepl(pattern, captions, ignore.case = TRUE)]
  if (length(mention_times) < 2) {
    return(NULL)
  }

  # A new segment opens at the first mention and whenever the gap to the
  # previous mention exceeds max_gap; it closes at the mention just before
  # the next opening (or at the last mention).
  opens <- c(TRUE, diff(mention_times) > max_gap)
  closes <- c(opens[-1], TRUE)
  segs <- data.frame(
    start = mention_times[opens],
    stop = mention_times[closes]
  )

  # add program metadata
  segs$start_time <- prog$start_localtime
  segs$program <- prog$program
  segs$title <- prog$title

  # segment text = every caption line whose time lies within [start, stop]
  segs$txt <- vapply(
    seq_len(nrow(segs)),
    function(k) {
      in_seg <- line_times >= segs$start[k] & line_times <= segs$stop[k]
      paste0(captions[in_seg], collapse = " ")
    },
    character(1)
  )

  segs
}

# Build the cleaned segment table for one channel.
#
# channel: file-name prefix and value stored in the `channel` column.
# Returns a data frame with columns start, stop, start_time, program, title,
# txt, trump, channel, date, tst, duration. Stops with an explicit error when
# the channel has no programs or yields no segments.
build_channel_segments <- function(channel) {
  programs <- read_channel_programs(channel)
  if (length(programs) == 0) {
    stop("No programs found for channel '", channel, "'", call. = FALSE)
  }

  pb <- txtProgressBar(min = 0, max = length(programs), style = 3)
  on.exit(close(pb), add = TRUE)

  # Collect per-program results in a list and bind once at the end instead of
  # growing a data frame inside the loop.
  per_program <- vector("list", length(programs))
  for (j in seq_along(programs)) {
    prog_segs <- program_segments(programs[[j]])
    if (!is.null(prog_segs)) {
      per_program[[j]] <- prog_segs
    }
    setTxtProgressBar(pb, j)
  }
  segs <- do.call(rbind, Filter(Negate(is.null), per_program))
  if (is.null(segs)) {
    stop(
      "No immigration segments found for channel '", channel, "'",
      call. = FALSE
    )
  }

  # drop anything with fewer than 2 mentions w/in a minute
  segs <- segs[segs$start != segs$stop, ]

  # was trump mentioned in the segment?
  segs$trump <- as.numeric(grepl("trump", segs$txt, ignore.case = TRUE))

  segs$channel <- rep(channel, nrow(segs))
  segs$date <- as.Date(substr(segs$start_time, 1, 10))

  # the pulls cover overlapping dates, so identical rows can appear twice
  segs <- segs[!duplicated(segs), ]

  # check that we're getting the right segments, then drop misses
  segs$tst <- as.numeric(grepl(immigr_pattern, segs$txt, ignore.case = TRUE))
  cat(sprintf("%s: share of segments matching pattern = %.4f\n",
              channel, mean(segs$tst)))
  segs <- segs[segs$tst == 1, ]

  # duration in minutes (times are treated as seconds)
  segs$duration <- (segs$stop - segs$start) / 60
  cat(sprintf("%s: mean segment duration (mins) = %.4f\n",
              channel, mean(segs$duration)))

  segs
}

#### GET NEWS SEGMENTS ####
# Mean durations noted by the original run: MSNBC .75, CNN .77, FOX .82 mins.

### MSNBC ###
msnbc_segs <- build_channel_segments("msnbc")

### CNN ###
cnn_segs <- build_channel_segments("cnn")

### FOX ###
fox_segs <- build_channel_segments("fox")

tot <- rbind(msnbc_segs, fox_segs, cnn_segs)

# save the file for easier access later
write.csv(tot, file = here("output", "tot_immigr_segs.csv"))

###########################
###########################
# RUN MODEL
###########################
###########################

setwd(here("release_data","TopicModelInputs"))

# Optional shortcut to reload previously saved segments instead of rebuilding
# them. NOTE(review): fread() comes from data.table, which this script does
# not load - confirm before uncommenting.
#tot <- fread("tot_immigr_segs.csv", data.table=F, header=T)

# Total minutes of immigration coverage per day and channel.
agg_date <- aggregate(tot$duration, list(tot$date, tot$channel), sum)
colnames(agg_date) <- c("date", "channel", "mins")
agg_date$date <- as.Date(agg_date$date)

# Scatter of daily minutes with a smoothed trend per channel. Dashed lines
# mark 2015-06-16 and 2017-01-20, the boundaries between the three labelled
# periods. Colours are named so the mapping no longer depends on the
# alphabetical ordering of the channel values.
ggplot(agg_date, aes(x = date, y = mins, color = channel)) +
  geom_point(alpha = 0.1) +
  stat_smooth(se = FALSE) +
  scale_color_manual(
    values = c(cnn = "magenta", fox = "firebrick3", msnbc = "royalblue3")
  ) +
  geom_vline(xintercept = as.Date("2015-06-16"), linetype = "dashed") +
  geom_vline(xintercept = as.Date("2017-01-20"), linetype = "dashed") +
  geom_label(
    x = as.Date("2014-08-01"), y = 150, label = "Pre-Campaign",
    size = 3, show.legend = FALSE, color = "black"
  ) +
  geom_label(
    x = as.Date("2016-05-01"), y = 150, label = "Campaign",
    size = 3, show.legend = FALSE, color = "black"
  ) +
  geom_label(
    x = as.Date("2018-06-01"), y = 150, label = "Post-Inauguration",
    size = 3, show.legend = FALSE, color = "black"
  ) +
  theme(
    legend.position = "bottom",
    legend.title = element_blank(),
    axis.title = element_text(size = 8)
  ) +
  xlab(NULL) +
  # axis label previously read "Immigrantion"
  ylab("Immigration News \n Coverage (Mins)")


#### RUNNING TOPIC MODEL ####

# Structural topic model with topic prevalence conditioned on period, channel
# and their interaction.

# Period indicators. Both include 2017-01-20; the "post-election" label is
# assigned last below, so it wins on that date.
tot$post_election <- as.numeric(tot$date >= as.Date("2017-01-20"))
tot$post_trump <- as.numeric(
  tot$date >= as.Date("2015-06-16") & tot$date <= as.Date("2017-01-20")
)

# Three-level period factor with "pre-election" as the reference level.
tot$time <- "pre-election"
tot$time[which(tot$post_trump == 1)] <- "election"
tot$time[which(tot$post_election == 1)] <- "post-election"
tot$time <- relevel(factor(tot$time), ref = "pre-election")

# Remove all segments with ancestry.com ads (the pattern is a case-sensitive
# regex).
tot <- tot[!grepl("Ancestry.com", tot$txt), ]

# The terms used to select segments (immigr|illegals|illegal alien) carry no
# topical information, so they are removed as custom stopwords. The two-word
# phrase is first collapsed into one token so it can be listed as a stopword.
tot$txt <- gsub("ILLEGAL ALIEN", "ILLEGALALIEN", tot$txt, ignore.case = TRUE)
processed <- textProcessor(
  documents = tot$txt,
  metadata = tot,
  customstopwords = c(
    "immigrant", "immigrants", "immigrate", "immigrated", "immigrating",
    "immigration", "illegals", "illegalalien", "illegalaliens"
  )
)

out <- prepDocuments(
  documents = processed$documents,
  vocab = processed$vocab,
  meta = processed$meta
)

# 30-topic model
immigrFit <- stm(
  documents = out$documents,
  vocab = out$vocab,
  K = 30,
  prevalence = ~ time + channel + time*channel,
  data = out$meta
)

# Save the whole workspace (segments, processed corpus, fitted model).
setwd(here("output"))
save.image(file = "TopicModel.RData")